Code
library(leaps)
n <- 10000
sim <- function(i) {
set.seed(i)
data <- tibble(
x1 = rnorm(n),
x2 = rnorm(n),
x3 = rnorm(n),
x4 = rnorm(n),
x5 = rnorm(n),
x6 = rnorm(n),
x7 = rnorm(n),
x8 = rnorm(n),
x9 = rnorm(n),
x10 = rnorm(n),
x11 = rnorm(n),
x12 = rnorm(n),
x13 = rnorm(n),
x14 = rnorm(n),
x15 = rnorm(n),
x16 = rnorm(n),
x17 = rnorm(n),
x18 = rnorm(n),
x19 = rnorm(n),
x20 = rnorm(n),
y = 0.2 * x1 + 0.2 * x2 + 0.2 * x3 + rnorm(n)
)
mod1 <- lm(y ~ ., data)
mod2 <- regsubsets(y ~ ., data)
significant_vars_mod1 <- summary(mod1)$coefficients[,4] < 0.05
mod2_summary <- summary(mod2, all.best = TRUE)
best_model <- which.min(mod2_summary$bic)
best_subset_vars <- mod2_summary$which[best_model,][-1] # Exclude intercept
var_names <- names(significant_vars_mod1)[-1] # Exclude intercept
data_frame_result <- tibble(
variable = var_names,
significant_in_mod1 = significant_vars_mod1[-1], # Exclude intercept
in_best_subset_mod2 = best_subset_vars,
seed = i
)
data_frame_result
}
x <- map_df(1:1000, sim)
x |>
group_by(variable) |>
summarise(prop_sig = mean(significant_in_mod1))# A tibble: 20 × 2
variable prop_sig
<chr> <dbl>
1 x1 1
2 x10 0.059
3 x11 0.04
4 x12 0.058
5 x13 0.057
6 x14 0.046
7 x15 0.054
8 x16 0.058
9 x17 0.041
10 x18 0.045
11 x19 0.06
12 x2 1
13 x20 0.052
14 x3 1
15 x4 0.054
16 x5 0.043
17 x6 0.053
18 x7 0.068
19 x8 0.047
20 x9 0.037